# libraries
library(knitr)
library(timetk)
library(tidytext)
library(plotly)
library(ggwordcloud)
library(ggiraphExtra)
library(fs)
library(readr)
library(tidyverse)
library(tidymodels)
library(tidyquant)
library(recipes)
library(umap)
# Python integration 
library(reticulate)

Set up R's Python interface (conda py3.8 environment)

conda create -n py3.8 python=3.8 scikit-learn pandas numpy matplotlib — This command creates a new conda environment called "py3.8", installs Python 3.8, and installs the latest versions of scikit-learn, pandas, numpy, and matplotlib.

Replace this with your conda environment containing sklearn, pandas, & numpy

1. Data

# Transaction-level retail order data (wrangled)
retail_order_tbl <-  read_rds("00_Data/data_wranggled/trans_data/retail_order_tbl.rds")

# Manually curated product metadata (provides stock_code and mode_description)
products_manual_tbl <-  read_rds("00_Data/data_wranggled/product_data/products_manual_tbl.rds")

# Pre-made train/test splits
step_2_split_test <- read_rds("00_Data/data_split/step_2_split_test.rds")

# NOTE(review): the label table is taken from the split's `test` element —
# confirm this is intentional and not a train/test mix-up.
step_2_split_train <- read_rds("00_Data/data_split/step_2_split_train.rds")
train_data_tbl   <- step_2_split_train$train
train_label_tbl  <- step_2_split_train$test

2. Plan Customer-Product (Unsupervised) Clustering

Problems: - Products are highly unique and have no categories - Categories are needed to learn which customers are similar in purchasing behaviour

Solution: - Price Feature Engineering will help group products by price categories - Text Feature Engineering is essential to compare description fields - Stock Code Engineering will group products with similar catalogue

Hypothesis: - Similar Products Embed Similar Word Tokens - Similar Products Have Similar Price Variations - Similar Products Have Similar Stock Codes (digits)

Algorithm: - Kmeans: Distance-Based Metric Learning - DBSCAN: Density-Based Metric Learning

Evaluation: - Summarise cluster result of its key components + comparative analysis - Perform double-clustering the result with customer purchase behaviour statistics (RFM metric) to evaluate these product category roles.

3. Products Feature Engineering (Extrapolation)

3.1 Text Feature Engineering

– Use ‘tidytext’ to unnest tokens – Stems the tokens using ‘hunspell’ to return only the root of the word

# Tokenise product descriptions into one stemmed term per row.
# Single-character tokens are dropped; `n = 1` marks each occurrence
# so later aggregations can count terms by summing.
products_dictionary_tbl <- products_manual_tbl %>%
    select(stock_code, mode_description) %>%
    unnest_tokens(terms, mode_description, token = "words") %>%
    filter(nchar(terms) != 1) %>%
    # hunspell_stem() returns a list-column of possible stems; unnest()
    # expands it back to one row per stem
    mutate(terms = hunspell::hunspell_stem(terms)) %>%
    unnest(terms) %>%
    mutate(n = 1)

3.1.1 Term Frequency

– Count Frequency – Remove stop words (e.g. "of" has no meaning) – Remove colour terms (e.g. "pink" has no meaning) – Remove terms with numbers (e.g. "130.5cm" has no meaning)

# Aggregate overall term frequencies, keeping only informative terms.
terms_frequency_tbl <- products_dictionary_tbl %>% 
    # Remove unnecessary terms 
    anti_join(stop_words, by = c("terms" = "word")) %>% 
    # Remove colour terms 
    filter(!terms %in% colours()) %>% 
    # Remove terms with numbers 
    filter(!terms %>% str_detect(pattern = "[0-9]")) %>% 
    # Summarise -> total frequency per term, most frequent first 
    group_by(terms) %>% 
    summarise(
        n = sum(n)
    ) %>% 
    arrange(desc(n))

3.1.2 Term Frequency Visualisation

Plan:

  • Simplify the algorithm by filtering top frequent terms
  • Eliminate uninformative descriptive terminology: – Cumulative Sum (frequency threshold) – Near Zero Variance (variance threshold)
# Load plotting helpers (get_rank_cumulative, plot_rank_cumulative)
source("00_Functions/Visualisation_function/visual.R")
  • Cumulative Plot – Flexible Filtering: Rank & Filter terms by Cumulative Sum – Cumulative Sum above 50% -> Relevant Term – Basis on the descending gradient of cumulative sum the threshold is selected
# Rank terms by cumulative frequency share; keep terms up to the 50%
# cumulative-sum threshold (the "relevant" terms)
rank_term_frequency_tbl <- terms_frequency_tbl %>% 
  get_rank_cumulative(.cumulative_max = 0.5)

# Visualise the cumulative-sum ranking for the top 500 terms
rank_term_frequency_tbl %>% plot_rank_cumulative(max_slice = 500)
  • Word Cloud Plot
# Word cloud of the 100 most frequent terms; text area and colour both
# encode term frequency, drawn on a black background.
terms_frequency_tbl %>%
    slice_head(n = 100) %>%
    mutate(terms = fct_rev(as_factor(terms))) %>%
    ggplot() +
    geom_text_wordcloud_area(aes(label = terms, size = n, colour = n)) +
    scale_size_area(max_size = 14) +
    scale_colour_viridis_c(direction = -1) +
    theme(
        plot.background  = element_rect(fill = "black"),
        panel.background = element_rect(fill = "black")
    )

Filter Term Frequency

# Keep only the 50 most frequent terms and spread them into a wide
# one-column-per-term feature matrix (term counts per stock_code).
top_50_terms <- terms_frequency_tbl %>%
    slice_head(n = 50) %>%
    pull(terms)

product_term_feature_tbl <- products_dictionary_tbl %>%
    filter(terms %in% top_50_terms) %>%
    pivot_wider(
        names_from  = terms,
        values_from = n,
        values_fill = list(n = 0),   # absent term -> count of 0
        values_fn   = list(n = sum)  # repeated term -> summed count
    )

3.2 Price Feature Engineering

Problems: - Price Variations - Highly Skewed (Critical damage for distance based algorithm)

Solution: - Implement Simple Statistics of product’s Price Range - Flexible Transformation (Data Pre-processing) -> Box-Cox or Yeo-Johnson

# Per-product price summary statistics computed from the training
# transactions; range_price is the spread normalised by the mean price.
product_price_range_tbl <- train_data_tbl %>% 
  group_by(stock_code) %>% 
  summarise(
    .groups     = "drop",
    n_prices    = n(),
    med_price   = median(unit_price) %>% round(2),
    # Bug fix: was median(unit_price) (copy-paste) — mean_price must be
    # the mean; this also corrects range_price below, which divides by it
    mean_price  = mean(unit_price) %>% round(2),
    min_price   = min(unit_price),
    p25_price   = quantile(unit_price)[2],
    p50_price   = quantile(unit_price)[3],
    p75_price   = quantile(unit_price)[4],
    max_price   = max(unit_price),
    range_price = ((max_price - min_price) / mean_price) %>% round(2)
  ) 

3.3 Stock Code Feature Engineering

A stock code here is not a stock keeping unit. Depending on the type of inventory, retailers include identifying information for everything from department to style, target customers, routine or seasonal/occasional use, etc. This identifies a product and helps the business track inventory for retail operations.

# Extract the numeric part of each stock code and split it into
# per-digit features (code_digit_5 ... code_digit_1).
product_code_tbl <- product_term_feature_tbl %>% 
  select(stock_code) %>% 
  # Bug fix: str_replace() removes only the FIRST letter/space match;
  # str_replace_all() strips them all so only digits remain
  mutate(code = str_replace_all(stock_code, "[a-zA-Z ]", ""))  %>% 
  # sep = "" splits between every character; the first piece is empty,
  # hence six target columns with code_digit_6 dropped just below
  separate(col = code,
           into = str_c("code_digit_", 6:1),
           sep = "",
           remove = FALSE) %>% 
  select(-code_digit_6) %>% 
  # funs() is deprecated and mutate_at() superseded; across() is the
  # modern equivalent
  mutate(across(starts_with("code_digit_"), as.numeric)) %>%
  select(-code) 

Feature Preprocessing

Pre-processing steps — Step 1: Remove near-zero-variance features: this removes most of the tokenised term features whose variance falls below the given frequency threshold. Step 2: A Yeo-Johnson generalised transformation is applied to the heavily skewed numeric price data. Steps 3 & 4: The k-means and DBSCAN algorithms rely on distance metrics (e.g. Euclidean distance, density), hence it is important to centre and scale the data.

# Assemble the full engineered feature set — term features + stock-code
# digit features + price statistics — all keyed by stock_code.
# Explicit join keys avoid dplyr's "Joining, by = ..." guessing.
product_engineered_tbl <- product_term_feature_tbl %>% 
  left_join(product_code_tbl, by = "stock_code") %>% 
  left_join(product_price_range_tbl, by = "stock_code") 

# Names of the (right-skewed) price columns to be Yeo-Johnson transformed
skewed_feature_names <- product_engineered_tbl %>%
  select(contains("price")) %>%
  names()

# Pre-processing recipe for distance-based algorithms:
# nzv -> drop near-zero-variance term features, Yeo-Johnson -> unskew
# prices, centre + scale -> comparable distances across features.
recipe_obj <- recipe(stock_code ~ ., product_engineered_tbl) %>% 
  step_nzv(all_predictors()) %>% 
  # all_of() makes selecting from an external character vector explicit;
  # passing the bare vector is ambiguous/deprecated in tidyselect
  step_YeoJohnson(all_of(skewed_feature_names)) %>% 
  step_center(all_numeric()) %>% 
  step_scale(all_numeric()) %>% 
  prep()

# Apply the trained recipe, drop incomplete rows, then split the
# features (X_train) from the stock_code identifier (Y_label).
prep_product_cluster_tbl <- recipe_obj %>%
  juice() %>%
  filter(complete.cases(.))

X_train <- prep_product_cluster_tbl %>% select(-stock_code)
Y_label <- prep_product_cluster_tbl %>% select(stock_code)

Modelling

– K-means Clustering – Scree Plot: to find the optimal number of clusters k – 2-D UMAP: for visualisation purposes

# Fit k-means with a fixed seed so repeated calls are reproducible.
# `center` is the number of clusters; `data` defaults to the prepared
# feature table. nstart = 20 restarts guard against poor local optima.
kmeans_mapper <- function(center = 3, data = X_train) {
    set.seed(42)
    kmeans(data, centers = center, nstart = 20)
}

# Fit k-means for k = 1..20 and attach broom::glance() summaries
# (tot.withinss etc.) as a list-column for scree-plot inspection.
centers_tbl <- tibble(centers = 1:20)

k_means_mapped_tbl <- centers_tbl %>%
    mutate(
        k_means = map(centers, kmeans_mapper),
        glance  = map(k_means, broom::glance)
    )

– Optimal k-cluster ~ 10

# Scree plot of a k-means fit metric (default tot.withinss) against the
# number of clusters, used to pick the elbow / optimal k.
#
# @param kmeans_map_data tibble with a `centers` column and a `glance`
#   list-column of broom::glance() results (see k_means_mapped_tbl).
# @param .metric unquoted glance() column to plot on the y axis.
# @return a ggplot object.
plot_kmeans_skree <- function(kmeans_map_data, .metric = tot.withinss){
  
  metric_expr <- enquo(.metric)
  
  g <- kmeans_map_data %>% 
    unnest(glance) %>% 
    # Fix: `colour` was previously passed to ggplot() itself, where it is
    # silently ignored; colour belongs on the individual geoms below.
    ggplot(aes(x = centers, y = !! metric_expr)) +
    geom_point(colour = "#2c3e50", size = 3) +
    geom_line(colour = "#2c3e50", size = 1) +
    ggrepel::geom_label_repel(aes(label = centers), colour = "#2c3e50", size = 4) +
    theme_tq()
  
  return(g)
  
}

k_means_mapped_tbl %>% plot_kmeans_skree()

UMAP – 2-dimensional Visualisation

# Project the engineered features into 2-D with UMAP for visualisation.
umap_results <- umap(X_train)

# Convert the layout matrix to a tibble with x/y columns and
# re-attach the original feature columns.
umap_results_tbl <- umap_results$layout %>%
    as_tibble() %>%
    setNames(c("x", "y")) %>%
    cbind(X_train)
# Pull the fitted k = 10 model out of the mapped results table.
k_means_obj <- k_means_mapped_tbl %>%
    filter(centers == 10) %>%
    pull(k_means) %>%
    pluck(1)

# Attach each product's cluster assignment to its UMAP coordinates.
umap_kmeans_results_tbl <- k_means_obj %>%
    broom::augment(X_train) %>%
    bind_cols(Y_label) %>%
    select(stock_code, .cluster) %>%
    bind_cols(umap_results_tbl)

# 2-D UMAP scatter coloured by k-means cluster membership.
umap_kmeans_results_tbl %>%
    ggplot(aes(x = x, y = y, colour = .cluster)) +
    geom_point(alpha = 0.5) +
    theme_tq() +
    scale_colour_tq()

kmeans Result Output + Save

# Recover the term-feature column names that were fed into k-means:
# everything that is not a price, digit, cluster, coordinate, or id column.
kmeans_fed_terms <- umap_kmeans_results_tbl %>%
  select(
    -contains("price"),
    -contains("digit"),
    -.cluster, -x, -y, -stock_code
  ) %>%
  colnames()

# Final k = 10 cluster result: cluster id + UMAP coordinates, re-joined
# with the raw (pre-recipe) term, price, and digit features plus the
# manual product metadata. all_of() makes the external-vector selection
# explicit and by = "stock_code" silences dplyr's join-key guessing.
kmeans_10_result_tbl <- umap_kmeans_results_tbl %>%
  select(stock_code, .cluster, x, y) %>% 
  left_join(product_engineered_tbl %>% select(stock_code, all_of(kmeans_fed_terms)),
            by = "stock_code") %>% 
  left_join(product_engineered_tbl %>% select(stock_code, contains("price")),
            by = "stock_code") %>% 
  left_join(product_engineered_tbl %>% select(stock_code, contains("digit")),
            by = "stock_code") %>% 
  left_join(products_manual_tbl %>% select(-median_unit_price),
            by = "stock_code")
product_cluster_tbl <- kmeans_10_result_tbl

# Tokenise a description column into stemmed, informative terms.
# Mirrors the products_dictionary_tbl pipeline: unnest words, stem with
# hunspell, drop stop words / colour names / tokens containing digits,
# and mark each occurrence with n = 1 for later counting.
#
# @param data a data frame containing the document column.
# @param .document unquoted column holding the text (default mode_description).
# @return the input rows expanded to one row per retained term.
product_disctionary <- function(data,
                                .document = mode_description){
  document_expr <- enquo(.document)

  out_tbl <- data %>%
    unnest_tokens(terms, !! document_expr, token = "words") %>%
    mutate(terms = hunspell::hunspell_stem(terms)) %>%
    unnest(terms) %>%
    # Remove stop words
    anti_join(stop_words, by = c("terms" = "word")) %>%
    # Remove colour terms
    filter(!terms %in% colours()) %>%
    # Remove terms containing digits
    filter(!str_detect(terms, pattern = "[0-9]")) %>%
    mutate(n = 1)

  return(out_tbl)
}

# Top-50 most frequent terms within each cluster.
term_freq_by_cluster_tbl <- product_cluster_tbl %>%
  product_disctionary() %>% 
  group_by(.cluster, terms) %>% 
  summarise(n = sum(n)) %>% 
  # Bug fix: arrange()'s argument is `.by_group`; the previous
  # `by_group = TRUE` was parsed as an extra constant sort key and the
  # grouping was ignored. `.by_group = TRUE` sorts within each cluster.
  arrange(desc(n), .by_group = TRUE) %>% 
  # summarise() leaves the data grouped by .cluster, so slice() takes
  # the first (i.e. most frequent) 50 terms per cluster
  slice(1:50)
# Dot plot of the 10 most frequent terms per cluster, faceted by
# cluster, then rendered interactively with plotly.
g <- term_freq_by_cluster_tbl %>%
  slice(1:10) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  # reorder term levels by frequency so facets read top-down
  mutate(terms = fct_reorder(as_factor(terms), n)) %>%
  ggplot(aes(n, terms, colour = .cluster)) +
  geom_point() +
  expand_limits(x = 0) +
  facet_wrap(~ .cluster, ncol = 5, scales = "free_y") +
  theme_tq() +
  scale_colour_tq() +
  labs(y = "") +
  theme(legend.position = "none")

ggplotly(g)
# Reload the persisted results and profile each cluster's mean feature
# values (term features, price statistics, code digits) on a radar chart.
umap_kmeans_results_tbl <- read_rds("00_Data/data_engineered/umap_kmeans_results_tbl.rds")
umap_kmeans_results_tbl %>% 
  group_by(.cluster) %>% 
  # summarise_at()/vars() are superseded; across() is the modern form
  summarise(across(heart:range_price, mean)) %>% 
  select(.cluster, heart:candle, med_price, mean_price, max_price, range_price, contains("digit_")) %>% 
  plot_facet_radar(colour = .cluster, .facet_vars = .cluster)

Save

– kmeans k = 10 (optimal observed by tot.withinss) result table – Final Product Cluster Table

# Persist the k = 10 cluster assignments with their UMAP coordinates
write_rds(umap_kmeans_results_tbl, "00_Data/data_engineered/umap_kmeans_results_tbl.rds")

# Persist the final product cluster table (features + metadata)
write_rds(product_cluster_tbl, "00_Data/data_engineered/product_cluster_tbl.rds")

# Export the tokenising helper for reuse in later analysis scripts
dump("product_disctionary", "00_Functions/Iterative_analysis/product_disctionary.R")